import requests
import csv
from bs4 import BeautifulSoup
import re

# Helper: find the positions of a marker string in a text
def find_en(st, text):
    """Return the start index of every occurrence of ``st`` in ``text``.

    Each search restarts one character past the previous hit, so overlapping
    occurrences are reported too.

    Args:
        st: Substring to look for.
        text: String to search.

    Returns:
        A list of match positions in ascending order, always terminated by a
        single ``-1`` sentinel; ``[-1]`` therefore means "no match".
    """
    position = []
    start = 0
    while True:
        # Search the argument itself so the helper works on any string and
        # does not depend on a module-level variable being defined.
        pos = text.find(st, start)
        position.append(pos)
        if pos == -1:
            # The sentinel is always appended, however many matches exist.
            return position
        start = pos + 1

# Helper: reconcile ASCII and full-width (Chinese) colons by inserting the full-width-colon Q/A positions
def correct(datalist1, datalist2):
    """Merge the positions of ``datalist1`` into ``datalist2`` in place.

    Both lists are expected to be ascending position lists that end with a
    ``-1`` sentinel. The sentinel of ``datalist1`` is not copied; every other
    entry is inserted into ``datalist2`` so that the real positions stay
    sorted and the sentinel of ``datalist2`` stays last.

    Args:
        datalist1: Positions to merge in (left unmodified).
        datalist2: Destination list, modified in place.

    Returns:
        None.
    """
    for extra in datalist1[:-1]:
        # Only compare against real positions; the last slot is the sentinel.
        for idx in range(len(datalist2) - 1):
            if datalist2[idx] > extra:
                datalist2.insert(idx, extra)
                break
        else:
            # No existing position is larger: the new one belongs right
            # before the sentinel instead of being dropped.
            datalist2.insert(len(datalist2) - 1, extra)

# Helper: strip section headings that may trail an answer
def delete(list1):
    """Cut each string of ``list1`` at its first run of four newlines.

    The list is modified in place. A string containing ``'\\n\\n\\n\\n'`` is
    shortened to everything before the run plus the run's first newline;
    strings without such a run are left untouched.

    Args:
        list1: List of strings to trim.

    Returns:
        None.
    """
    for idx, text in enumerate(list1):
        cut = text.find('\n\n\n\n')
        if cut == -1:
            continue
        # Keep one newline of the separator, drop whatever follows it.
        list1[idx] = text[:cut + 1]


# Lists and marker strings used by the scraper.
# FAQ pages to scrape; url, process and process1 are index-aligned.
url = [
    'https://mindspore.cn/docs/zh-CN/master/faq/installation.html',
    'https://mindspore.cn/docs/zh-CN/master/faq/data_processing.html',
    'https://mindspore.cn/docs/zh-CN/master/faq/implement_problem.html',
    'https://mindspore.cn/docs/zh-CN/master/faq/network_compilation.html',
    'https://mindspore.cn/docs/zh-CN/master/faq/operators_compile.html',
    'https://mindspore.cn/docs/zh-CN/master/faq/usage_migrate_3rd.html',
    'https://mindspore.cn/docs/zh-CN/master/faq/performance_tuning.html',
    'https://mindspore.cn/docs/zh-CN/master/faq/precision_tuning.html',
    'https://mindspore.cn/docs/zh-CN/master/faq/distributed_configure.html',
    'https://mindspore.cn/docs/zh-CN/master/faq/inference.html',
    'https://mindspore.cn/docs/zh-CN/master/faq/feature_advice.html',
]
# CSS selector passed to BeautifulSoup.select() for the matching page.
process = [
    '#安装',
    '#数据处理',
    '#执行问题',
    '#网络编译',
    '#算子编译',
    '#第三方框架迁移使用',
    '#性能调优',
    '#精度调优',
    '#分布式配置',
    '#推理',
    '#特性咨询',
]
# Label written to the "process" column for rows of the matching page.
process1 = [
    '安装',
    '数据处理',
    '执行问题',
    '编译',
    '编译',
    '第三方框架迁移',
    '性能调优',
    '精度调优',
    '分布式配置',
    '推理',
    '特性咨询',
]
# Keywords searched (case-insensitively) in each question + answer text.
# NOTE(review): 'Datalouder', 'HCLL' and 'NCLL' look like misspellings of
# 'DataLoader', 'HCCL' and 'NCCL' -- confirm before changing, since these
# strings decide which tags end up in the output.
tag = [
    'GPU',
    'CuBLAS库',
    'Linux',
    'arm',
    'macOS',
    'SDK版本',
    'SciPy',
    'whl包',
    'protobuf',
    'Ubuntu',
    'MindInsight',
    'MIndArmour',
    'CPU',
    'Windows',
    'WSL',
    'Ascend',
    'Conda',
    'Serving',
    'gmp',
    'cuda',
    'MindIR',
    'API',
    'MindRecord',
    'JupyterLab',
    'Generator Dataset',
    'MindData',
    'pipeline',
    'Datalouder',
    'Dataset',
    'eval',
    'Model',
    'SGD',
    'loss',
    'PyTorch',
    'NLP',
    'ModelZoo',
    'HCLL',
    'ModelArts',
    'PyNative',
    'Graph',
    'TensorFlow',
    'C++',
    'AIPP',
    'OpenMPI',
    'NCLL',
    'RDMA',
    'IB',
    'RoCE',
    'taichi',
    'Caffe',
    'NPU',
]
# Question / answer markers written with an ASCII colon ...
str1 = "Q:"
str2 = "A:"
# ... and the same markers written with a full-width (Chinese) colon.
str3 = "Q："
str4 = "A："
# Rows collected from every page, later written to the CSV file.
qa = []
# Regex-escaped copies of the tags, so characters such as '+' match literally.
tag_change = [re.escape(keyword) for keyword in tag]
# Main loop: scrape every FAQ page and split its text into question/answer rows.
# Compile the tag patterns once; they are the same for every page and question.
# NOTE(review): matching is by substring, so short tags such as 'IB' or 'arm'
# also hit inside longer words -- confirm whether that is intended.
tag_patterns = [re.compile(pattern, flags=re.IGNORECASE) for pattern in tag_change]
for page_url, selector, category in zip(url, process, process1):
    # Download the page. A timeout keeps a stalled connection from hanging the
    # whole run, and an HTTP error is reported instead of parsing an error page.
    req = requests.get(url=page_url, timeout=30)
    req.raise_for_status()
    req.encoding = 'utf-8'
    bs = BeautifulSoup(req.text, "html.parser")  # html.parser is the parser backend
    result = bs.select(selector)
    if not result:
        raise ValueError(
            'no element matches {!r} on {}'.format(selector, page_url))
    body = result[0].get_text()
    # Locate the markers. Each list holds ascending positions followed by a
    # -1 sentinel; the "wrong_*" lists are the full-width-colon variants.
    question_en = find_en(str1, body)
    answer_en = find_en(str2, body)
    wrong_question_en = find_en(str3, body)
    wrong_answer_en = find_en(str4, body)
    # Merge the full-width-colon markers into the main position lists.
    correct(wrong_question_en, question_en)
    correct(wrong_answer_en, answer_en)
    # If the lists still do not line up, assume some answers lack their "A"
    # marker and use the question's full-width question mark as the boundary.
    if len(question_en) != len(answer_en):
        for m in range(len(question_en) - 2):
            # The answer paired with question m lies beyond the next question,
            # so question m itself has no marked answer.
            if question_en[m + 1] < answer_en[m]:
                mark = body.find('？', question_en[m])
                if mark != -1:
                    # Insert at index m so the answer list stays sorted and
                    # aligned with the question list.
                    answer_en.insert(m, mark)
    # Slice the page text into questions and answers using the positions.
    question = [
        body[question_en[m]:answer_en[m]]
        for m in range(len(question_en) - 1)
    ]
    answer = [
        body[answer_en[n]:question_en[n + 1]]
        for n in range(len(answer_en) - 2)
    ]
    # The last answer runs to the end of the text.
    answer.append(body[answer_en[len(answer_en) - 2]:])
    # Answers may be followed by a section heading; cut it off.
    delete(answer)
    # Build one CSV row per question, tagging it with every keyword found
    # (case-insensitively) in the combined question + answer text.
    for idx, question_text in enumerate(question):
        answer_text = answer[idx]
        combined = question_text + answer_text
        found = []
        for pattern in tag_patterns:
            hit = pattern.search(combined)
            if hit is not None:
                found.append(hit.group(0))
        qa.append([question_text, page_url, answer_text, category, '、'.join(found)])

# Write the collected rows to dataset.csv, header row first.
headers = ('title', 'link', 'content', 'process', 'tag')
with open('dataset.csv', 'w', encoding='utf-8', newline='') as out_file:
    writer = csv.writer(out_file)  # newline='' lets the csv module control line endings
    writer.writerow(headers)
    writer.writerows(qa)